#install.packages("JGL")
#install.packages("JointNets")
#install.packages("tidyverse")
library(JGL)
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.1.2
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(JointNets) # for plotting JGL results
## Loading required package: lpSolve
## Loading required package: pcaPP
## Loading required package: parallel
## Registered S3 method overwritten by 'JointNets':
## method from
## plot.jgl JGL
##
## Attaching package: 'JointNets'
## The following object is masked from 'package:JGL':
##
## plot.jgl
## The following object is masked from 'package:stats':
##
## BIC
library(igraph)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::as_data_frame() masks tibble::as_data_frame(), igraph::as_data_frame()
## x purrr::compose() masks igraph::compose()
## x tidyr::crossing() masks igraph::crossing()
## x dplyr::filter() masks stats::filter()
## x dplyr::groups() masks igraph::groups()
## x dplyr::lag() masks stats::lag()
## x purrr::simplify() masks igraph::simplify()
#source(here::here("code/helper_functions.R"))
source(here::here("code/visualize.r"))
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
theme_set(theme_bw())
# read in data from Raji
dat <- read.csv(here::here("data/hapo_metabolomics_2020.csv"))
The data consists of id, K = 4 ancestry groups (n = 400 each), and metabolites (p = 51 features). There is also a variable “fpg” which is fasting plasma glucose, I will omit that for now.
Let’s first prepare the data by centering and scaling each metabolite feature (within class).
summary(as.factor(dat$anc_gp))
## ag1 ag2 ag3 ag4
## 400 400 400 400
We can see that the raw data has different distributions, but assumptions of JGL need data to be centered and scaled.
head(dat)[,1:10]
## id anc_gp fpg mt1_1 mt1_2 mt1_3 mt1_4 mt1_5 mt1_6
## 1 hm0001 ag3 75.6 218.2223 76.99525 19.06366 14.23091 86.75162 135.2109
## 2 hm0002 ag3 84.6 292.6314 136.41320 43.14854 17.77549 120.17344 213.6531
## 3 hm0003 ag4 79.2 361.1135 79.98370 22.15848 13.05497 74.75441 136.1587
## 4 hm0004 ag3 73.8 274.0428 79.76154 19.72682 12.68744 69.34854 146.7262
## 5 hm0005 ag1 91.8 270.9049 88.98260 39.20949 15.73498 95.56857 145.2876
## 6 hm0006 ag4 73.8 271.4034 99.63992 27.81137 20.91344 95.55926 223.7438
## mt1_7
## 1 64.00578
## 2 91.30156
## 3 83.67878
## 4 72.67280
## 5 48.49431
## 6 68.94172
summary(dat$mt1_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 115.6 281.8 322.4 328.5 370.6 660.4
summary(dat$mt2_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4.631 -3.341 -3.040 -3.044 -2.750 -1.154 1
summary(dat$mt3_1)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 17.28 21.39 22.64 22.96 24.64 27.25 11
hist(dat$mt1_1)
hist(dat$mt2_1)
hist(dat$mt3_1)
# prepare data into list of K datasets
# make data into long format then center and scale by metabolite and group.
dat_long <- dat %>%
select(-fpg) %>%
pivot_longer(cols = starts_with("mt"),
names_to = "metabolite",
values_to = "value",
values_drop_na = TRUE) %>%
group_by(anc_gp, metabolite) %>%
mutate(scaled_value = scale(value, center = TRUE, scale = TRUE)) %>%
ungroup()
dat_long %>%
filter(metabolite %in% c("mt1_1", "mt2_1", "mt3_1")) %>%
ggplot(aes(x=scaled_value, color=anc_gp)) +
geom_density()+
facet_grid(metabolite ~.)
Now it’s ready! But first we need to get it into K n by p matrices as required by JGL package.
?? Should we do any imputation?
# make list of ancestry groups
ancestry_groups <- sort(unique(dat$anc_gp))
# filter by ancestry group, then make wide matrix of scaled values
# create a list of the results
dat_mat_list <- map(ancestry_groups,
~dat_long %>%
filter(anc_gp == .x) %>%
select(-c(anc_gp, value)) %>%
pivot_wider(names_from = metabolite,
values_from = scaled_value,
#for simplicity putting 0 as missing values. Normally should do imputation.
values_fill = 0) %>%
select(-id) %>%
as.matrix())
names(dat_mat_list) <- ancestry_groups
str(dat_mat_list)
## List of 4
## $ ag1: num [1:400, 1:51] -1.154 -0.675 -1.182 1.211 0.6 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## $ ag2: num [1:400, 1:51] -0.145 0.379 -1.474 -1.313 -0.897 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## $ ag3: num [1:400, 1:51] -1.269 0.329 -0.07 1.317 0.259 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## $ ag4: num [1:400, 1:51] 0.119 -1.257 0.131 -0.346 -0.541 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
So now we have a list of 4 matrices, each with 400 standardized observations of 51 metabolites.
We can get to joint graphical lasso now!
fgl_results = JGL(Y = dat_mat_list,
penalty = "fused",
lambda1 = .08,
lambda2 = 0.02,
return.whole.theta = FALSE)
str(fgl_results)
## List of 3
## $ theta :List of 4
## ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## ..$ : num [1:51, 1:51] 1.37 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## $ theta.unconnected: NULL
## $ connected : logi [1:51] TRUE TRUE TRUE TRUE TRUE TRUE ...
## - attr(*, "class")= chr "jgl"
print.jgl(fgl_results)
##
## Number of connected nodes: 51
## Number of subnetworks in each class: 1 1 1 1
## Number of edges in each class: 294 300 272 292
## Number of edges shared by all classes: 214
## L1 norm of off-diagonal elements of classes' Thetas: 75.25031 75.50375 72.97726 72.85025
# extract all estimated covariance matrices from result
inv_covar_matrices <- fgl_results$theta
names(inv_covar_matrices) <- ancestry_groups
#now use function from igraph to create igraph graphs from adjacency matrices
graph_list <- map(ancestry_groups,
~graph_from_adjacency_matrix(
-cov2cor(inv_covar_matrices[[.x]]),
weighted = T,
mode = "undirected",
diag = FALSE
))
names(graph_list) <- ancestry_groups
plot.igraph(graph_list[["ag1"]],
layout = layout.fruchterman.reingold)
plot_jgl(graph_list[[1]], multiplier = 3)
plot_jgl(graph_list[[2]], multiplier = 3)
plot_jgl(graph_list[[3]], multiplier = 3)
plot_jgl(graph_list[[4]], multiplier = 3)
Now run ggl
## run ggl:
ggl_results = JGL(Y=dat_mat_list,
penalty="group",
lambda1=.15,
lambda2=.2,
return.whole.theta=TRUE)
str(ggl_results)
## List of 2
## $ theta :List of 4
## ..$ : num [1:51, 1:51] 1.09 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## ..$ : num [1:51, 1:51] 1.05 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## ..$ : num [1:51, 1:51] 1.1 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## ..$ : num [1:51, 1:51] 1.1 0 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## .. .. ..$ : chr [1:51] "mt1_1" "mt1_2" "mt1_3" "mt1_4" ...
## $ connected: logi [1:51] TRUE TRUE TRUE TRUE TRUE TRUE ...
## - attr(*, "class")= chr "jgl"
print.jgl(ggl_results)
##
## Number of connected nodes: 48
## Number of subnetworks in each class: 1 1 1 1
## Number of edges in each class: 146 146 146 155
## Number of edges shared by all classes: 130
## L1 norm of off-diagonal elements of classes' Thetas: 36.99183 34.28299 37.9018 35.52136
# extract all estimated covariance matrices from result
ggl_inv_covar_matrices <- ggl_results$theta
names(ggl_inv_covar_matrices) <- ancestry_groups
#now use function from igraph to create igraph graphs from adjacency matrices
ggl_graph_list <- map(ancestry_groups,
~graph_from_adjacency_matrix(
-cov2cor(ggl_inv_covar_matrices[[.x]]),
weighted = T,
mode = "undirected",
diag = FALSE
))
names(ggl_graph_list) <- ancestry_groups
plot_jgl(ggl_graph_list[[1]], multiplier = 3)
plot_jgl(ggl_graph_list[[2]], multiplier = 3)
plot_jgl(ggl_graph_list[[3]], multiplier = 3)
plot_jgl(ggl_graph_list[[4]], multiplier = 3)
BUT what we’ve done so far we’ve had to pre-select the lambdas.
Let’s instead do a